# clear environment
rm(list=ls())

# packages
library(sandwich)
library(lmtest)

# load data set with topic proportions and crime numbers
dat = read.csv("stm_data.csv")

# calculate logged ratio of topic proportions
dat$topic_props_ratio_log <- log(dat$topic_prop_right + 0.5) - log(dat$topic_prop_left + 0.5)

# remove missing values in logged ratio of topic proportions 
dat = dat[!is.na(dat$topic_props_ratio_log),]

# define center-right interior minister
dat$cr_intmin <- ifelse(dat$intmin_party == "cdu" | dat$intmin_party== "csu" | dat$intmin_party=="fdp", 1,
                        ifelse(dat$intmin_party=="pro", NA, 0))

# remove those without center-right or center-left interior minister
dat <- dat[!is.na(dat$cr_intmin),]

# create decade indicator
dat$decade <- NA
dat$decade[dat$year<1960]<-1950
dat$decade[dat$year>=1960 & dat$year<1970]<-1960
dat$decade[dat$year>=1970 & dat$year<1980]<-1970
dat$decade[dat$year>=1980 & dat$year<1990]<-1980
dat$decade[dat$year>=1990 & dat$year<2000]<-1990
dat$decade[dat$year>=2000 & dat$year<2010]<-2000
dat$decade[dat$year>=2010 & dat$year<2020]<-2010
dat$decade[dat$year>=2020 & dat$year<2030]<-2020
dat <- dat[dat$year>1950,]

# get crime ratio on federal level as a separate variable
crime_fed <- dat[dat$jurisdiction=="bund",c("year", "extreme_crime_right", "extreme_crime_left")]
crime_fed <- crime_fed[!duplicated(crime_fed$year),]
names(crime_fed) <- c("year", "extreme_crime_right_fed", "extreme_crime_left_fed")
crime_fed$rwe_lwe_crime_ratio_log_fed <- log(crime_fed$extreme_crime_right_fed + 0.5) - log(crime_fed$extreme_crime_left_fed + 0.5)
dat <- merge(dat, crime_fed, by="year", all.x=T)

# ratio of crimes
dat$rwe_lwe_crime_ratio_log <- log(dat$extreme_crime_right + 0.5) - log(dat$extreme_crime_left + 0.5)

# impute crime ratio with federal
dat$rwe_lwe_crime_ratio_log_fedimp <- ifelse(is.na(dat$rwe_lwe_crime_ratio_log),
                                             dat$rwe_lwe_crime_ratio_log_fed,
                                             dat$rwe_lwe_crime_ratio_log)

# get the difference in ratios of crime and chapter length
dat$bias_ratio_fedimp <- dat$topic_props_ratio_log - dat$rwe_lwe_crime_ratio_log_fedimp

## regressions models
# position
mod_1 <- lm(topic_props_ratio_log ~ factor(cr_intmin), data=dat)
mod_2 <- lm(topic_props_ratio_log ~ factor(cr_intmin) + factor(jurisdiction), data=dat)
mod_3 <- lm(topic_props_ratio_log ~ factor(cr_intmin) + factor(jurisdiction) + factor(decade), data=dat)
mod_4 <- lm(topic_props_ratio_log ~ factor(cr_intmin) + factor(jurisdiction) + factor(year), data=dat)

mod_1_cl<-coeftest(mod_1, vcov=vcovHC(mod_1,type="HC0",cluster="id"))
mod_2_cl<-coeftest(mod_2, vcov=vcovHC(mod_2,type="HC0",cluster="id"))
mod_3_cl<-coeftest(mod_3, vcov=vcovHC(mod_3,type="HC0",cluster="id"))
mod_4_cl<-coeftest(mod_4, vcov=vcovHC(mod_4,type="HC0",cluster="id"))

# bias
mod_5 <- lm(bias_ratio_fedimp ~ factor(cr_intmin), data=dat)
mod_6 <- lm(bias_ratio_fedimp ~ factor(cr_intmin) + factor(jurisdiction), data=dat)
mod_7 <- lm(bias_ratio_fedimp ~ factor(cr_intmin) + factor(jurisdiction) + factor(decade), data=dat)
mod_8 <- lm(bias_ratio_fedimp ~ factor(cr_intmin) + factor(jurisdiction) + factor(year), data=dat)

mod_5_cl<-coeftest(mod_5, vcov=vcovHC(mod_5,type="HC0",cluster="id"))
mod_6_cl<-coeftest(mod_6, vcov=vcovHC(mod_6,type="HC0",cluster="id"))
mod_7_cl<-coeftest(mod_7, vcov=vcovHC(mod_7,type="HC0",cluster="id"))
mod_8_cl<-coeftest(mod_8, vcov=vcovHC(mod_8,type="HC0",cluster="id"))
